#Importing the data and files
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import seaborn as sns
from scipy import stats; from scipy.stats import zscore, norm, randint
import warnings
warnings.filterwarnings("ignore")
# Load the accident records from the CSV file, report the frame's shape and
# preview the first rows.
data= pd.read_csv('IHMStefanini_industrial_safety_and_health_database_with_accidents_description.csv')
print("Shape of the dataset is :",data.shape)
data.head()
Shape of the dataset is : (425, 11)
| Unnamed: 0 | Data | Countries | Local | Industry Sector | Accident Level | Potential Accident Level | Genre | Employee or Third Party | Critical Risk | Description | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2016-01-01 00:00:00 | Country_01 | Local_01 | Mining | I | IV | Male | Third Party | Pressed | While removing the drill rod of the Jumbo 08 f... |
| 1 | 1 | 2016-01-02 00:00:00 | Country_02 | Local_02 | Mining | I | IV | Male | Employee | Pressurized Systems | During the activation of a sodium sulphide pum... |
| 2 | 2 | 2016-01-06 00:00:00 | Country_01 | Local_03 | Mining | I | III | Male | Third Party (Remote) | Manual Tools | In the sub-station MILPO located at level +170... |
| 3 | 3 | 2016-01-08 00:00:00 | Country_01 | Local_04 | Mining | I | I | Male | Third Party | Others | Being 9:45 am. approximately in the Nv. 1880 C... |
| 4 | 4 | 2016-01-10 00:00:00 | Country_01 | Local_04 | Mining | IV | IV | Male | Third Party | Others | Approximately at 11:45 a.m. in circumstances t... |
# The "Unnamed: 0" column only repeats the row index, so discard it in place,
# then count the missing values in each remaining column.
data.drop(columns=["Unnamed: 0"], inplace=True)
data.isna().sum()
Data 0 Countries 0 Local 0 Industry Sector 0 Accident Level 0 Potential Accident Level 0 Genre 0 Employee or Third Party 0 Critical Risk 0 Description 0 dtype: int64
# Report the shape before and after removing fully duplicated rows, together
# with the number of duplicates found.
print("Shape of the dataset before duplicates deletion is :",data.shape)
print('Number of duplicates in the dataset :',data.duplicated().sum())
data.drop_duplicates(inplace=True)
print("Shape of the dataset after duplicates deletion is :",data.shape)
Shape of the dataset before duplicates deletion is : (425, 10) Number of duplicates in the dataset : 7 Shape of the dataset after duplicates deletion is : (418, 10)
# Show the dtype of every column, followed by the DataFrame.info() report.
print('********Checking the dtypes*********\n')
print(data.dtypes)
print('----------------------------------------------')
print('\n *********Checking the data info********* \n')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()
********Checking the dtypes********* Data object Countries object Local object Industry Sector object Accident Level object Potential Accident Level object Genre object Employee or Third Party object Critical Risk object Description object dtype: object ---------------------------------------------- *********Checking the data info********* <class 'pandas.core.frame.DataFrame'> Int64Index: 418 entries, 0 to 424 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Data 418 non-null object 1 Countries 418 non-null object 2 Local 418 non-null object 3 Industry Sector 418 non-null object 4 Accident Level 418 non-null object 5 Potential Accident Level 418 non-null object 6 Genre 418 non-null object 7 Employee or Third Party 418 non-null object 8 Critical Risk 418 non-null object 9 Description 418 non-null object dtypes: object(10) memory usage: 35.9+ KB None
The output above shows that every column of the data frame has dtype `object`.
# Summary statistics for every column (count, unique, top and freq for the
# object-typed columns).
data.describe()
| Data | Countries | Local | Industry Sector | Accident Level | Potential Accident Level | Genre | Employee or Third Party | Critical Risk | Description | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 418 | 418 | 418 | 418 | 418 | 418 | 418 | 418 | 418 | 418 |
| unique | 287 | 3 | 12 | 3 | 5 | 6 | 2 | 3 | 33 | 411 |
| top | 2017-02-08 00:00:00 | Country_01 | Local_03 | Mining | I | IV | Male | Third Party | Others | In the geological reconnaissance activity, in ... |
| freq | 6 | 248 | 89 | 237 | 309 | 141 | 396 | 185 | 229 | 2 |
# Give four columns clearer names:
#   Data -> Date, Countries -> Country, Genre -> Gender,
#   Employee or Third Party -> Natureofemployee
data.rename(
    {
        'Data': 'Date',
        'Countries': 'Country',
        'Genre': 'Gender',
        'Employee or Third Party': 'Natureofemployee',
    },
    axis='columns',
    inplace=True,
)
data.head(3)
| Date | Country | Local | Industry Sector | Accident Level | Potential Accident Level | Gender | Natureofemployee | Critical Risk | Description | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2016-01-01 00:00:00 | Country_01 | Local_01 | Mining | I | IV | Male | Third Party | Pressed | While removing the drill rod of the Jumbo 08 f... |
| 1 | 2016-01-02 00:00:00 | Country_02 | Local_02 | Mining | I | IV | Male | Employee | Pressurized Systems | During the activation of a sodium sulphide pum... |
| 2 | 2016-01-06 00:00:00 | Country_01 | Local_03 | Mining | I | III | Male | Third Party (Remote) | Manual Tools | In the sub-station MILPO located at level +170... |
# List the distinct values of every column except the date and the free-text
# description.
col = [name for name in data.columns if name not in ('Date', 'Description')]
for cols in col:
    print(f'Unique values for {cols} is \n{data[cols].unique()}\n')
Unique values for Country is ['Country_01' 'Country_02' 'Country_03'] Unique values for Local is ['Local_01' 'Local_02' 'Local_03' 'Local_04' 'Local_05' 'Local_06' 'Local_07' 'Local_08' 'Local_10' 'Local_09' 'Local_11' 'Local_12'] Unique values for Industry Sector is ['Mining' 'Metals' 'Others'] Unique values for Accident Level is ['I' 'IV' 'III' 'II' 'V'] Unique values for Potential Accident Level is ['IV' 'III' 'I' 'II' 'V' 'VI'] Unique values for Gender is ['Male' 'Female'] Unique values for Natureofemployee is ['Third Party' 'Employee' 'Third Party (Remote)'] Unique values for Critical Risk is ['Pressed' 'Pressurized Systems' 'Manual Tools' 'Others' 'Fall prevention (same level)' 'Chemical substances' 'Liquid Metal' 'Electrical installation' 'Confined space' 'Pressurized Systems / Chemical Substances' 'Blocking and isolation of energies' 'Suspended Loads' 'Poll' 'Cut' 'Fall' 'Bees' 'Fall prevention' '\nNot applicable' 'Traffic' 'Projection' 'Venomous Animals' 'Plates' 'Projection/Burning' 'remains of choco' 'Vehicles and Mobile Equipment' 'Projection/Choco' 'Machine Protection' 'Power lock' 'Burn' 'Projection/Manual Tools' 'Individual protection equipment' 'Electrical Shock' 'Projection of fragments']
# Encode the labels 'Local_01' ... 'Local_12' as the integers 1 ... 12.
replace_val = {f'Local_{number:02d}': number for number in range(1, 13)}
data['Local'] = data['Local'].map(replace_val)
# Integer encodings of the two accident-level columns, currently disabled:
# replace_val = {'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5}
# data['Accident Level'] = data['Accident Level'].map(replace_val)
# replace_val = {'I': 0, 'II': 1, 'III': 2, 'IV': 3, 'V': 4, 'VI': 5}
# data['Potential Accident Level'] = data['Potential Accident Level'].map(replace_val)
del replace_val
data.head(5)
| Date | Country | Local | Industry Sector | Accident Level | Potential Accident Level | Gender | Natureofemployee | Critical Risk | Description | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2016-01-01 00:00:00 | Country_01 | 1 | Mining | I | IV | Male | Third Party | Pressed | While removing the drill rod of the Jumbo 08 f... |
| 1 | 2016-01-02 00:00:00 | Country_02 | 2 | Mining | I | IV | Male | Employee | Pressurized Systems | During the activation of a sodium sulphide pum... |
| 2 | 2016-01-06 00:00:00 | Country_01 | 3 | Mining | I | III | Male | Third Party (Remote) | Manual Tools | In the sub-station MILPO located at level +170... |
| 3 | 2016-01-08 00:00:00 | Country_01 | 4 | Mining | I | I | Male | Third Party | Others | Being 9:45 am. approximately in the Nv. 1880 C... |
| 4 | 2016-01-10 00:00:00 | Country_01 | 4 | Mining | IV | IV | Male | Third Party | Others | Approximately at 11:45 a.m. in circumstances t... |
# Parse the timestamp strings, then derive the calendar features with the
# vectorised .dt accessor rather than calling a Python lambda once per row.
data['Date'] = pd.to_datetime(data['Date'])
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Weekday'] = data['Date'].dt.day_name()
data.head()
| Date | Country | Local | Industry Sector | Accident Level | Potential Accident Level | Gender | Natureofemployee | Critical Risk | Description | Year | Month | Weekday | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2016-01-01 | Country_01 | 1 | Mining | I | IV | Male | Third Party | Pressed | While removing the drill rod of the Jumbo 08 f... | 2016 | 1 | Friday |
| 1 | 2016-01-02 | Country_02 | 2 | Mining | I | IV | Male | Employee | Pressurized Systems | During the activation of a sodium sulphide pum... | 2016 | 1 | Saturday |
| 2 | 2016-01-06 | Country_01 | 3 | Mining | I | III | Male | Third Party (Remote) | Manual Tools | In the sub-station MILPO located at level +170... | 2016 | 1 | Wednesday |
| 3 | 2016-01-08 | Country_01 | 4 | Mining | I | I | Male | Third Party | Others | Being 9:45 am. approximately in the Nv. 1880 C... | 2016 | 1 | Friday |
| 4 | 2016-01-10 | Country_01 | 4 | Mining | IV | IV | Male | Third Party | Others | Approximately at 11:45 a.m. in circumstances t... | 2016 | 1 | Sunday |
def convert_to_season(x):
    """Return the season name for the calendar month number ``x``.

    The months are grouped as September-November -> 'Spring',
    December-February -> 'Summer', March-May -> 'Autumn' and
    June-August -> 'Winter'.

    Args:
        x: month number, 1 (January) to 12 (December).

    Returns:
        One of 'Spring', 'Summer', 'Autumn' or 'Winter'.

    Raises:
        ValueError: if ``x`` is not a month number from 1 to 12. The previous
            if/elif chain left ``season`` unassigned in that case and failed
            with an unrelated UnboundLocalError.
    """
    season_by_month = {
        9: 'Spring', 10: 'Spring', 11: 'Spring',
        12: 'Summer', 1: 'Summer', 2: 'Summer',
        3: 'Autumn', 4: 'Autumn', 5: 'Autumn',
        6: 'Winter', 7: 'Winter', 8: 'Winter',
    }
    try:
        return season_by_month[x]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which cannot be months either.
        raise ValueError(f'month must be an integer from 1 to 12, got {x!r}') from None
# Derive the 'Season' column from the month number via convert_to_season.
data['Season'] = data['Month'].apply(convert_to_season)
data.head(3)
| Date | Country | Local | Industry Sector | Accident Level | Potential Accident Level | Gender | Natureofemployee | Critical Risk | Description | Year | Month | Weekday | Season | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2016-01-01 | Country_01 | 1 | Mining | I | IV | Male | Third Party | Pressed | While removing the drill rod of the Jumbo 08 f... | 2016 | 1 | Friday | Summer |
| 1 | 2016-01-02 | Country_02 | 2 | Mining | I | IV | Male | Employee | Pressurized Systems | During the activation of a sodium sulphide pum... | 2016 | 1 | Saturday | Summer |
| 2 | 2016-01-06 | Country_01 | 3 | Mining | I | III | Male | Third Party (Remote) | Manual Tools | In the sub-station MILPO located at level +170... | 2016 | 1 | Wednesday | Summer |
# Column names covered by the exploratory analysis (everything except the raw
# date and the free-text description).
features = [
    'Country',
    'Local',
    'Industry Sector',
    'Accident Level',
    'Potential Accident Level',
    'Gender',
    'Natureofemployee',
    'Critical Risk',
    'Year',
    'Month',
    'Weekday',
    'Season',
]
def univariate_analysis_categorical(dataset,feature):
    """Print and plot a univariate summary of one column.

    For ``dataset[feature]`` this prints the unique values, draws a count plot
    with the bars ordered by descending frequency, a pie chart with percentage
    labels and a histogram, and finally prints the value counts in descending
    order.

    Args:
        dataset: pandas DataFrame that contains the column.
        feature: name of the column to summarise.

    Returns:
        None. All output goes to stdout and to matplotlib figures.
    """
    column = dataset[feature]
    # Computed once and shared by the count plot, the pie chart and the table.
    counts = column.value_counts()

    print("\n")
    print("===========================================================================================")
    print("Univariate Analysis of feature: ",feature)
    print("===========================================================================================\n")
    print("Unique values: ",feature)
    print("-----------------")
    print(column.unique())
    print("\n")

    print("-----------------")
    print("Countplot for feature: ",feature)
    print("-----------------")
    plt.figure(figsize=(10,6))
    # Pass the column as x=...: seaborn deprecated positional vectors and, from
    # v0.12 on, treats the first positional argument as `data`.
    sns.countplot(x=column, order=counts.index)
    plt.xticks(rotation = 'vertical')
    plt.show()

    print("-----------------")
    print("Pie Chart for feature: ",feature)
    print("------------------")
    plt.figure(figsize=(10,6))
    counts.plot.pie(autopct="%.1f%%")
    plt.show()
    print("\n")

    print("-----------------")
    print("Histplot for feature: ",feature)
    print("-------------------")
    plt.figure(figsize=(10,6))
    sns.histplot(column)
    plt.show()
    print("\n")

    print("-----------------")
    print("Value Counts for feature: ",feature)
    print("-------------------")
    print(counts.sort_values(ascending=False))
# Optional install step, left disabled.
# NOTE(review): presumably pyqt5 was wanted for a Qt matplotlib backend - confirm
# whether it is still needed given %matplotlib inline.
#!pip install pyqt5
# Univariate report (unique values, count plot, pie chart, histogram, value counts) for Country.
univariate_analysis_categorical(data,'Country')
=========================================================================================== Univariate Analysis of feature: Country =========================================================================================== Unique values: Country ----------------- ['Country_01' 'Country_02' 'Country_03'] ----------------- Countplot for feature: Country -----------------
----------------- Pie Chart for feature: Country ------------------
----------------- Histplot for feature: Country -------------------
----------------- Value Counts for feature: Country ------------------- Country_01 248 Country_02 129 Country_03 41 Name: Country, dtype: int64
----- From the above plots, we can conclude the following:
Country_01 accounts for 248 accidents, Country_02 for 129, and Country_03 for 41.
The pie chart shows that Country_01 is the most affected country, with about 59% of all accidents, and Country_03 is the least affected, with about 10%.
In short, Country_01 has the most accidents and Country_03 the fewest.
# Univariate report (unique values, count plot, pie chart, histogram, value counts) for Local.
univariate_analysis_categorical(data,'Local')
=========================================================================================== Univariate Analysis of feature: Local =========================================================================================== Unique values: Local ----------------- [ 1 2 3 4 5 6 7 8 10 9 11 12] ----------------- Countplot for feature: Local -----------------
----------------- Pie Chart for feature: Local ------------------
----------------- Histplot for feature: Local -------------------
----------------- Value Counts for feature: Local ------------------- 3 89 5 59 1 56 4 55 6 46 10 41 8 27 2 23 7 14 12 4 9 2 11 2 Name: Local, dtype: int64
# Univariate report for Industry Sector.
univariate_analysis_categorical(data,'Industry Sector')
=========================================================================================== Univariate Analysis of feature: Industry Sector =========================================================================================== Unique values: Industry Sector ----------------- ['Mining' 'Metals' 'Others'] ----------------- Countplot for feature: Industry Sector -----------------
----------------- Pie Chart for feature: Industry Sector ------------------
----------------- Histplot for feature: Industry Sector -------------------
----------------- Value Counts for feature: Industry Sector ------------------- Mining 237 Metals 134 Others 47 Name: Industry Sector, dtype: int64
# Univariate report for Accident Level.
univariate_analysis_categorical(data,'Accident Level')
=========================================================================================== Univariate Analysis of feature: Accident Level =========================================================================================== Unique values: Accident Level ----------------- ['I' 'IV' 'III' 'II' 'V'] ----------------- Countplot for feature: Accident Level -----------------
----------------- Pie Chart for feature: Accident Level ------------------
----------------- Histplot for feature: Accident Level -------------------
----------------- Value Counts for feature: Accident Level ------------------- I 309 II 40 III 31 IV 30 V 8 Name: Accident Level, dtype: int64
# Univariate report for Potential Accident Level.
univariate_analysis_categorical(data,'Potential Accident Level')
=========================================================================================== Univariate Analysis of feature: Potential Accident Level =========================================================================================== Unique values: Potential Accident Level ----------------- ['IV' 'III' 'I' 'II' 'V' 'VI'] ----------------- Countplot for feature: Potential Accident Level -----------------
----------------- Pie Chart for feature: Potential Accident Level ------------------
----------------- Histplot for feature: Potential Accident Level -------------------
----------------- Value Counts for feature: Potential Accident Level ------------------- IV 141 III 106 II 95 I 45 V 30 VI 1 Name: Potential Accident Level, dtype: int64
# Univariate report for Gender.
univariate_analysis_categorical(data,'Gender')
=========================================================================================== Univariate Analysis of feature: Gender =========================================================================================== Unique values: Gender ----------------- ['Male' 'Female'] ----------------- Countplot for feature: Gender -----------------
----------------- Pie Chart for feature: Gender ------------------
----------------- Histplot for feature: Gender -------------------
----------------- Value Counts for feature: Gender ------------------- Male 396 Female 22 Name: Gender, dtype: int64
# Univariate report for Natureofemployee (employee vs. third party).
univariate_analysis_categorical(data,'Natureofemployee')
=========================================================================================== Univariate Analysis of feature: Natureofemployee =========================================================================================== Unique values: Natureofemployee ----------------- ['Third Party' 'Employee' 'Third Party (Remote)'] ----------------- Countplot for feature: Natureofemployee -----------------
----------------- Pie Chart for feature: Natureofemployee ------------------
----------------- Histplot for feature: Natureofemployee -------------------
----------------- Value Counts for feature: Natureofemployee ------------------- Third Party 185 Employee 178 Third Party (Remote) 55 Name: Natureofemployee, dtype: int64
From the above, Third Party workers account for the most accidents (185), only slightly ahead of Employees (178), while Third Party (Remote) workers account for the fewest (55).
# Univariate report for Critical Risk. The commented-out lines below draw only
# the frequency-ordered count plot on a wider figure; the call produces the
# full report (count plot, pie chart, histogram, value counts).
# plt.figure(figsize=(20,5))
# descending_order = data['Critical Risk'].value_counts().sort_values(ascending=False).index
# sns.countplot(x=data['Critical Risk'],order=descending_order);
# plt.xticks(rotation = 'vertical')
univariate_analysis_categorical(data,'Critical Risk')
=========================================================================================== Univariate Analysis of feature: Critical Risk =========================================================================================== Unique values: Critical Risk ----------------- ['Pressed' 'Pressurized Systems' 'Manual Tools' 'Others' 'Fall prevention (same level)' 'Chemical substances' 'Liquid Metal' 'Electrical installation' 'Confined space' 'Pressurized Systems / Chemical Substances' 'Blocking and isolation of energies' 'Suspended Loads' 'Poll' 'Cut' 'Fall' 'Bees' 'Fall prevention' '\nNot applicable' 'Traffic' 'Projection' 'Venomous Animals' 'Plates' 'Projection/Burning' 'remains of choco' 'Vehicles and Mobile Equipment' 'Projection/Choco' 'Machine Protection' 'Power lock' 'Burn' 'Projection/Manual Tools' 'Individual protection equipment' 'Electrical Shock' 'Projection of fragments'] ----------------- Countplot for feature: Critical Risk -----------------
----------------- Pie Chart for feature: Critical Risk ------------------
----------------- Histplot for feature: Critical Risk -------------------
----------------- Value Counts for feature: Critical Risk ------------------- Others 229 Pressed 24 Manual Tools 20 Chemical substances 17 Cut 14 Projection 13 Venomous Animals 13 Bees 10 Fall 9 Vehicles and Mobile Equipment 8 remains of choco 7 Fall prevention (same level) 7 Pressurized Systems 7 Suspended Loads 6 Fall prevention 6 Pressurized Systems / Chemical Substances 3 Power lock 3 Blocking and isolation of energies 3 Liquid Metal 3 Electrical Shock 2 Machine Protection 2 Projection/Choco 1 Burn 1 Projection/Manual Tools 1 Electrical installation 1 \nNot applicable 1 Poll 1 Projection/Burning 1 Plates 1 Individual protection equipment 1 Traffic 1 Projection of fragments 1 Confined space 1 Name: Critical Risk, dtype: int64
When we count the number of incidents by each type of critical risk, Others tops the list.
# Univariate report for the derived Year column.
univariate_analysis_categorical(data,'Year')
=========================================================================================== Univariate Analysis of feature: Year =========================================================================================== Unique values: Year ----------------- [2016 2017] ----------------- Countplot for feature: Year -----------------
----------------- Pie Chart for feature: Year ------------------
----------------- Histplot for feature: Year -------------------
----------------- Value Counts for feature: Year ------------------- 2016 283 2017 135 Name: Year, dtype: int64
From the above, it is clear that most accidents happened in 2016: 283 of the 418 records.
# Univariate report for the derived Month column.
univariate_analysis_categorical(data,'Month')
=========================================================================================== Univariate Analysis of feature: Month =========================================================================================== Unique values: Month ----------------- [ 1 2 3 4 5 6 7 8 9 10 11 12] ----------------- Countplot for feature: Month -----------------
----------------- Pie Chart for feature: Month ------------------
----------------- Histplot for feature: Month -------------------
----------------- Value Counts for feature: Month ------------------- 2 61 4 51 6 51 3 50 5 40 1 39 7 24 9 24 12 23 8 21 10 21 11 13 Name: Month, dtype: int64
# Univariate report for the derived Weekday column.
univariate_analysis_categorical(data,'Weekday')
=========================================================================================== Univariate Analysis of feature: Weekday =========================================================================================== Unique values: Weekday ----------------- ['Friday' 'Saturday' 'Wednesday' 'Sunday' 'Tuesday' 'Thursday' 'Monday'] ----------------- Countplot for feature: Weekday -----------------
----------------- Pie Chart for feature: Weekday ------------------
----------------- Histplot for feature: Weekday -------------------
----------------- Value Counts for feature: Weekday ------------------- Thursday 76 Tuesday 69 Wednesday 62 Friday 61 Saturday 56 Monday 53 Sunday 41 Name: Weekday, dtype: int64
# Count plot of accidents per Accident Level, split by Gender.
sns.countplot(x="Accident Level",hue="Gender", data=data)
<AxesSubplot:xlabel='Accident Level', ylabel='count'>
# Contingency table: Accident Level (rows) against Gender (columns).
bivariate_analysis_df = pd.crosstab(index=data['Accident Level'],columns=data['Gender'])
print("------------------------------------------")
print("Cross table Analysis of features: ",'Accident Level',' and ', 'Gender')
print("------------------------------------------")
display(bivariate_analysis_df)
------------------------------------------ Cross table Analysis of features: Accident Level and Gender ------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Accident Level | ||
| I | 18 | 291 |
| II | 3 | 37 |
| III | 1 | 30 |
| IV | 0 | 30 |
| V | 0 | 8 |
From the count plot above, most accidents are of level I and involve male workers (291 cases).
# Count plot of accidents per Potential Accident Level, split by Gender.
sns.countplot(x="Potential Accident Level",hue="Gender", data=data)
<AxesSubplot:xlabel='Potential Accident Level', ylabel='count'>
# Contingency table: Potential Accident Level (rows) against Gender (columns).
bivariate_analysis_df = pd.crosstab(index=data['Potential Accident Level'],columns=data['Gender'])
print("\n Cross table Analysis of features: ",'Potential Accident Level',' and ', 'Gender')
print("--------------------------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Potential Accident Level and Gender --------------------------------------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Potential Accident Level | ||
| I | 0 | 45 |
| II | 14 | 81 |
| III | 3 | 103 |
| IV | 4 | 137 |
| V | 1 | 29 |
| VI | 0 | 1 |
From the above, it can be seen that at every potential accident level far more accidents involve males than females, and that Potential Accident Level IV is the most common.
# Count plot of accidents per Country, split by Gender.
sns.countplot(x="Country",hue="Gender", data=data)
<AxesSubplot:xlabel='Country', ylabel='count'>
# Contingency table: Country (rows) against Gender (columns).
bivariate_analysis_df = pd.crosstab(index=data['Country'],columns=data['Gender'])
print("\n Cross table Analysis of features: ",'Country',' and ', 'Gender')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Country and Gender ------------------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Country | ||
| Country_01 | 7 | 241 |
| Country_02 | 15 | 114 |
| Country_03 | 0 | 41 |
From the count plot above, the largest group is males in Country_01, with 241 accidents.
# Count plot of accidents per Industry Sector, split by Gender.
sns.countplot(x="Industry Sector",hue="Gender", data=data)
<AxesSubplot:xlabel='Industry Sector', ylabel='count'>
# Contingency table: Industry Sector (rows) against Gender (columns).
bivariate_analysis_df = pd.crosstab(index=data['Industry Sector'],columns=data['Gender'])
print("\n Cross table Analysis of features: ",'Industry Sector',' and ', 'Gender')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Industry Sector and Gender ------------------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Industry Sector | ||
| Metals | 13 | 121 |
| Mining | 5 | 232 |
| Others | 4 | 43 |
From the above count plot, it is evident that most of the accidents happened to Male in the mining sector, around 232.
# Count plot of accidents per Year, split by Gender.
sns.countplot(x="Year",hue="Gender", data=data)
<AxesSubplot:xlabel='Year', ylabel='count'>
# Contingency table: Year (rows) against Gender (columns).
bivariate_analysis_df = pd.crosstab(index=data['Year'],columns=data['Gender'])
print("\n Cross table Analysis of features: ",'Year',' and ', 'Gender')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Year and Gender ------------------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Year | ||
| 2016 | 14 | 269 |
| 2017 | 8 | 127 |
From the above countplot, it is clearly evident that maximum accidents took place in 2016 to the male when compared to female with a count of 269.
# Count plot of accidents per Month, split by Gender.
sns.countplot(x="Month",hue="Gender", data=data)
<AxesSubplot:xlabel='Month', ylabel='count'>
# Contingency table: Month (rows) against Gender (columns).
bivariate_analysis_df = pd.crosstab(index=data['Month'],columns=data['Gender'])
print("\n Cross table Analysis of features: ",'Month',' and ', 'Gender')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Month and Gender ------------------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Month | ||
| 1 | 0 | 39 |
| 2 | 4 | 57 |
| 3 | 1 | 49 |
| 4 | 1 | 50 |
| 5 | 4 | 36 |
| 6 | 1 | 50 |
| 7 | 4 | 20 |
| 8 | 3 | 18 |
| 9 | 3 | 21 |
| 10 | 0 | 21 |
| 11 | 0 | 13 |
| 12 | 1 | 22 |
From the count plot above, the largest number of accidents involving males occurred in February, with a count of 57.
# Count plot of accidents per Weekday, split by Gender.
sns.countplot(x="Weekday",hue="Gender", data=data)
<AxesSubplot:xlabel='Weekday', ylabel='count'>
# Contingency table: Weekday (rows) against Gender (columns).
bivariate_analysis_df = pd.crosstab(index=data['Weekday'],columns=data['Gender'])
print("\n Cross table Analysis of features: ",'Weekday',' and ', 'Gender')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Weekday and Gender ------------------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Weekday | ||
| Friday | 2 | 59 |
| Monday | 4 | 49 |
| Saturday | 1 | 55 |
| Sunday | 2 | 39 |
| Thursday | 3 | 73 |
| Tuesday | 6 | 63 |
| Wednesday | 4 | 58 |
The largest number of accidents involving males occurred on Thursday, with a count of 73.
# Count plot of accidents per Natureofemployee (employee vs. third party), split by Gender.
sns.countplot(x="Natureofemployee",hue="Gender", data=data)
<AxesSubplot:xlabel='Natureofemployee', ylabel='count'>
# Contingency table: Natureofemployee (rows) against Gender (columns).
bivariate_analysis_df = pd.crosstab(index=data['Natureofemployee'],columns=data['Gender'])
print("\n Cross table Analysis of features: ",'Natureofemployee',' and ', 'Gender')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Natureofemployee and Gender ------------------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Natureofemployee | ||
| Employee | 8 | 170 |
| Third Party | 9 | 176 |
| Third Party (Remote) | 5 | 50 |
From the above output, the largest group is male third-party workers, with 176 accidents.
# Count plot of accidents per Critical Risk, split by Gender.
sns.countplot(x="Critical Risk",hue="Gender", data=data)
# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
# Rotate the category labels by 90 degrees.
plt.xticks(rotation=90)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]),
[Text(0, 0, 'Pressed'),
Text(1, 0, 'Pressurized Systems'),
Text(2, 0, 'Manual Tools'),
Text(3, 0, 'Others'),
Text(4, 0, 'Fall prevention (same level)'),
Text(5, 0, 'Chemical substances'),
Text(6, 0, 'Liquid Metal'),
Text(7, 0, 'Electrical installation'),
Text(8, 0, 'Confined space'),
Text(9, 0, 'Pressurized Systems / Chemical Substances'),
Text(10, 0, 'Blocking and isolation of energies'),
Text(11, 0, 'Suspended Loads'),
Text(12, 0, 'Poll'),
Text(13, 0, 'Cut'),
Text(14, 0, 'Fall'),
Text(15, 0, 'Bees'),
Text(16, 0, 'Fall prevention'),
Text(17, 0, '\nNot applicable'),
Text(18, 0, 'Traffic'),
Text(19, 0, 'Projection'),
Text(20, 0, 'Venomous Animals'),
Text(21, 0, 'Plates'),
Text(22, 0, 'Projection/Burning'),
Text(23, 0, 'remains of choco'),
Text(24, 0, 'Vehicles and Mobile Equipment'),
Text(25, 0, 'Projection/Choco'),
Text(26, 0, 'Machine Protection'),
Text(27, 0, 'Power lock'),
Text(28, 0, 'Burn'),
Text(29, 0, 'Projection/Manual Tools'),
Text(30, 0, 'Individual protection equipment'),
Text(31, 0, 'Electrical Shock'),
Text(32, 0, 'Projection of fragments')])
# Contingency table: Critical Risk (rows) against Gender (columns).
bivariate_analysis_df = pd.crosstab(index=data['Critical Risk'],columns=data['Gender'])
print("\n Cross table Analysis of features: ",'Critical Risk',' and ', 'Gender')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Critical Risk and Gender ------------------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Critical Risk | ||
| \nNot applicable | 0 | 1 |
| Bees | 0 | 10 |
| Blocking and isolation of energies | 0 | 3 |
| Burn | 0 | 1 |
| Chemical substances | 4 | 13 |
| Confined space | 0 | 1 |
| Cut | 3 | 11 |
| Electrical Shock | 0 | 2 |
| Electrical installation | 0 | 1 |
| Fall | 1 | 8 |
| Fall prevention | 0 | 6 |
| Fall prevention (same level) | 1 | 6 |
| Individual protection equipment | 0 | 1 |
| Liquid Metal | 0 | 3 |
| Machine Protection | 0 | 2 |
| Manual Tools | 1 | 19 |
| Others | 9 | 220 |
| Plates | 0 | 1 |
| Poll | 0 | 1 |
| Power lock | 0 | 3 |
| Pressed | 0 | 24 |
| Pressurized Systems | 1 | 6 |
| Pressurized Systems / Chemical Substances | 0 | 3 |
| Projection | 0 | 13 |
| Projection of fragments | 0 | 1 |
| Projection/Burning | 0 | 1 |
| Projection/Choco | 0 | 1 |
| Projection/Manual Tools | 0 | 1 |
| Suspended Loads | 0 | 6 |
| Traffic | 1 | 0 |
| Vehicles and Mobile Equipment | 0 | 8 |
| Venomous Animals | 1 | 12 |
| remains of choco | 0 | 7 |
Critical Risk of type "Others" is dominant across both Male and Female Genders
# Count plot of accidents per Industry Sector, split by Accident Level.
sns.countplot(x="Industry Sector",hue="Accident Level", data=data)
<AxesSubplot:xlabel='Industry Sector', ylabel='count'>
# Contingency table: Industry Sector (rows) against Accident Level (columns).
bivariate_analysis_df = pd.crosstab(index=data['Industry Sector'],columns=data['Accident Level'])
print("\n Cross table Analysis of features: ",'Industry Sector',' and ', 'Accident Level')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Industry Sector and Accident Level ------------------------------------------------------
| Accident Level | I | II | III | IV | V |
|---|---|---|---|---|---|
| Industry Sector | |||||
| Metals | 107 | 12 | 7 | 7 | 1 |
| Mining | 163 | 26 | 20 | 21 | 7 |
| Others | 39 | 2 | 4 | 2 | 0 |
Maximum number of accidents happened in the mining sector with accident Level I. i.e- 163.
# Grouped bar chart: number of accidents in each Industry Sector, split by Potential Accident Level.
sns.countplot(data=data, x="Industry Sector", hue="Potential Accident Level")
<AxesSubplot:xlabel='Industry Sector', ylabel='count'>
# Contingency table of accident counts: rows = Industry Sector, columns = Potential Accident Level.
bivariate_analysis_df = pd.crosstab(
    data['Industry Sector'],
    data['Potential Accident Level'],
)
print("\n Cross table Analysis of features: ", 'Industry Sector', ' and ', 'Potential Accident Level')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Industry Sector and Potential Accident Level ------------------------------------------------------
| Potential Accident Level | I | II | III | IV | V | VI |
|---|---|---|---|---|---|---|
| Industry Sector | ||||||
| Metals | 7 | 48 | 44 | 33 | 2 | 0 |
| Mining | 8 | 40 | 61 | 99 | 28 | 1 |
| Others | 30 | 7 | 1 | 9 | 0 | 0 |
The largest cell is the Mining sector at Potential Accident Level IV, with 99 accidents. Potential Accident Level VI is the rarest: it occurs only once, in the Mining sector.
# Count plot of accidents per Critical Risk, with one bar colour per Industry Sector.
# A single full-width axes is used (the previous 1x2 subplot grid left the right half
# of the figure empty), and the legend seaborn builds from `hue` is kept as-is:
# relabelling it positionally with plt.legend(labels=...) can attach the labels to the
# wrong bar colours, because matplotlib pairs the labels with the first artists it
# finds on the axes rather than with the hue levels.
fig, ax = plt.subplots(figsize=(15, 7.2))
sns.countplot(x='Critical Risk', hue='Industry Sector', data=data, ax=ax, orient='v')
# Rotate the category names so the long Critical Risk labels do not overlap.
plt.xticks(rotation=90)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]),
[Text(0, 0, 'Pressed'),
Text(1, 0, 'Pressurized Systems'),
Text(2, 0, 'Manual Tools'),
Text(3, 0, 'Others'),
Text(4, 0, 'Fall prevention (same level)'),
Text(5, 0, 'Chemical substances'),
Text(6, 0, 'Liquid Metal'),
Text(7, 0, 'Electrical installation'),
Text(8, 0, 'Confined space'),
Text(9, 0, 'Pressurized Systems / Chemical Substances'),
Text(10, 0, 'Blocking and isolation of energies'),
Text(11, 0, 'Suspended Loads'),
Text(12, 0, 'Poll'),
Text(13, 0, 'Cut'),
Text(14, 0, 'Fall'),
Text(15, 0, 'Bees'),
Text(16, 0, 'Fall prevention'),
Text(17, 0, '\nNot applicable'),
Text(18, 0, 'Traffic'),
Text(19, 0, 'Projection'),
Text(20, 0, 'Venomous Animals'),
Text(21, 0, 'Plates'),
Text(22, 0, 'Projection/Burning'),
Text(23, 0, 'remains of choco'),
Text(24, 0, 'Vehicles and Mobile Equipment'),
Text(25, 0, 'Projection/Choco'),
Text(26, 0, 'Machine Protection'),
Text(27, 0, 'Power lock'),
Text(28, 0, 'Burn'),
Text(29, 0, 'Projection/Manual Tools'),
Text(30, 0, 'Individual protection equipment'),
Text(31, 0, 'Electrical Shock'),
Text(32, 0, 'Projection of fragments')])
# Contingency table of accident counts: rows = Critical Risk, columns = Industry Sector.
bivariate_analysis_df = pd.crosstab(
    data['Critical Risk'],
    data['Industry Sector'],
)
print("\n Cross table Analysis of features: ", 'Critical Risk', ' and ', 'Industry Sector')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Critical Risk and Industry Sector ------------------------------------------------------
| Industry Sector | Metals | Mining | Others |
|---|---|---|---|
| Critical Risk | |||
| \nNot applicable | 1 | 0 | 0 |
| Bees | 0 | 0 | 10 |
| Blocking and isolation of energies | 3 | 0 | 0 |
| Burn | 1 | 0 | 0 |
| Chemical substances | 15 | 2 | 0 |
| Confined space | 1 | 0 | 0 |
| Cut | 10 | 4 | 0 |
| Electrical Shock | 0 | 2 | 0 |
| Electrical installation | 0 | 1 | 0 |
| Fall | 2 | 5 | 2 |
| Fall prevention | 3 | 2 | 1 |
| Fall prevention (same level) | 6 | 1 | 0 |
| Individual protection equipment | 0 | 1 | 0 |
| Liquid Metal | 3 | 0 | 0 |
| Machine Protection | 2 | 0 | 0 |
| Manual Tools | 14 | 5 | 1 |
| Others | 33 | 176 | 20 |
| Plates | 1 | 0 | 0 |
| Poll | 0 | 0 | 1 |
| Power lock | 1 | 2 | 0 |
| Pressed | 17 | 7 | 0 |
| Pressurized Systems | 6 | 1 | 0 |
| Pressurized Systems / Chemical Substances | 3 | 0 | 0 |
| Projection | 4 | 9 | 0 |
| Projection of fragments | 0 | 1 | 0 |
| Projection/Burning | 1 | 0 | 0 |
| Projection/Choco | 0 | 0 | 1 |
| Projection/Manual Tools | 0 | 1 | 0 |
| Suspended Loads | 5 | 1 | 0 |
| Traffic | 0 | 0 | 1 |
| Vehicles and Mobile Equipment | 0 | 8 | 0 |
| Venomous Animals | 2 | 1 | 10 |
| remains of choco | 0 | 7 | 0 |
From the above count plot and cross table, it is evident that the maximum number of accidents happened in the Mining sector with a Critical Risk of "Others", i.e. 176.
# Grouped bar chart: number of accidents at each Local, split by Industry Sector.
sns.countplot(data=data, x="Local", hue="Industry Sector")
<AxesSubplot:xlabel='Local', ylabel='count'>
# Contingency table of accident counts: rows = Industry Sector, columns = Local.
bivariate_analysis_df = pd.crosstab(
    data['Industry Sector'],
    data['Local'],
)
print("\n Cross table Analysis of features: ", 'Industry Sector', ' and ', 'Local')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Industry Sector and Local ------------------------------------------------------
| Local | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Industry Sector | ||||||||||||
| Metals | 0 | 0 | 0 | 0 | 59 | 46 | 0 | 27 | 2 | 0 | 0 | 0 |
| Mining | 56 | 23 | 89 | 55 | 0 | 0 | 14 | 0 | 0 | 0 | 0 | 0 |
| Others | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 41 | 2 | 4 |
The largest count is Local 3 in the Mining sector, with 89 accidents. The smallest non-zero counts are Local 9 (Metals sector) and Local 11 (Others sector), with 2 accidents each.
# Grouped bar chart: number of accidents in each Year, split by Industry Sector.
sns.countplot(data=data, x="Year", hue="Industry Sector")
<AxesSubplot:xlabel='Year', ylabel='count'>
# Contingency table of accident counts: rows = Industry Sector, columns = Year.
bivariate_analysis_df = pd.crosstab(
    data['Industry Sector'],
    data['Year'],
)
print("\n Cross table Analysis of features: ", 'Industry Sector', ' and ', 'Year')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Industry Sector and Year ------------------------------------------------------
| Year | 2016 | 2017 |
|---|---|---|
| Industry Sector | ||
| Metals | 97 | 37 |
| Mining | 159 | 78 |
| Others | 27 | 20 |
1.The number of accidents that took place in the year 2016 in the mining sector is 159.
2.The number of accidents that took place in the year 2016 in the metals sector is 97.
3.The number of accidents that took place in the year 2016 in the others sector is 27. Hence, it can be determined that the maximum number of accidents took place in the mining sector in the year 2016.
4.The number of accidents that took place in the year 2017 in the mining sector is 78.
5.The number of accidents that took place in the year 2017 in the metals sector is 37.
6.The number of accidents that took place in the year 2017 in the others sector is 20.
Hence, it can be determined that max accidents took place in mining sector in the year 2017
# Grouped bar chart: number of accidents in each Industry Sector, split by Month.
sns.countplot(data=data, x="Industry Sector", hue="Month")
# Anchor the legend just outside the right edge of the axes so it does not cover the bars.
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x1a5073e6f40>
# Contingency table of accident counts: rows = Industry Sector, columns = Month.
bivariate_analysis_df = pd.crosstab(
    data['Industry Sector'],
    data['Month'],
)
print("\n Cross table Analysis of features: ", 'Industry Sector', ' and ', 'Month')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Industry Sector and Month ------------------------------------------------------
| Month | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Industry Sector | ||||||||||||
| Metals | 13 | 19 | 17 | 15 | 12 | 11 | 9 | 9 | 11 | 5 | 5 | 8 |
| Mining | 24 | 38 | 26 | 31 | 20 | 28 | 13 | 10 | 10 | 15 | 7 | 15 |
| Others | 2 | 4 | 7 | 5 | 8 | 12 | 2 | 2 | 3 | 1 | 1 | 0 |
Maximum number of accidents happened in the month feb and mining sector. The least number of accidents took place in the others sector and month december.
# Grouped bar chart: number of accidents in each Industry Sector, split by Weekday.
sns.countplot(data=data, x="Industry Sector", hue="Weekday")
<AxesSubplot:xlabel='Industry Sector', ylabel='count'>
# Contingency table of accident counts: rows = Industry Sector, columns = Weekday.
bivariate_analysis_df = pd.crosstab(
    data['Industry Sector'],
    data['Weekday'],
)
print("\n Cross table Analysis of features: ", 'Industry Sector', ' and ', 'Weekday')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Industry Sector and Weekday ------------------------------------------------------
| Weekday | Friday | Monday | Saturday | Sunday | Thursday | Tuesday | Wednesday |
|---|---|---|---|---|---|---|---|
| Industry Sector | |||||||
| Metals | 20 | 24 | 6 | 15 | 25 | 21 | 23 |
| Mining | 40 | 26 | 44 | 25 | 37 | 36 | 29 |
| Others | 1 | 3 | 6 | 1 | 14 | 12 | 10 |
The maximum number of accidents happened on Saturday in the Mining sector, i.e. 44. The fewest accidents happened on Friday and Sunday in the Others sector (1 each).
# Grouped bar chart: number of accidents in each Industry Sector, split by Country.
sns.countplot(data=data, x="Industry Sector", hue="Country")
<AxesSubplot:xlabel='Industry Sector', ylabel='count'>
# Contingency table of accident counts: rows = Industry Sector, columns = Country.
bivariate_analysis_df = pd.crosstab(
    data['Industry Sector'],
    data['Country'],
)
print("\n Cross table Analysis of features: ", 'Industry Sector', ' and ', 'Country')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Industry Sector and Country ------------------------------------------------------
| Country | Country_01 | Country_02 | Country_03 |
|---|---|---|---|
| Industry Sector | |||
| Metals | 46 | 88 | 0 |
| Mining | 200 | 37 | 0 |
| Others | 2 | 4 | 41 |
From the above count plot, it is evident that the maximum number of accidents took place in Country_01 in the Mining sector, i.e. 200. Country_03 has no accidents in the Metals or Mining sectors; the smallest non-zero count is Country_01 in the Others sector (2).
# Grouped bar chart: number of accidents in each Industry Sector, split by Natureofemployee.
sns.countplot(data=data, x="Industry Sector", hue="Natureofemployee")
<AxesSubplot:xlabel='Industry Sector', ylabel='count'>
# Contingency table of accident counts: rows = Industry Sector, columns = Natureofemployee.
bivariate_analysis_df = pd.crosstab(
    data['Industry Sector'],
    data['Natureofemployee'],
)
print("\n Cross table Analysis of features: ", 'Industry Sector', ' and ', 'Natureofemployee')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Industry Sector and Natureofemployee ------------------------------------------------------
| Natureofemployee | Employee | Third Party | Third Party (Remote) |
|---|---|---|---|
| Industry Sector | |||
| Metals | 76 | 31 | 27 |
| Mining | 89 | 120 | 28 |
| Others | 13 | 34 | 0 |
From the above count plot, it is clearly evident that the maximum number of accidents took place in the Mining sector with the Third Party employee type, i.e. 120. The fewest accidents took place in the Others sector with the Third Party (Remote) employee type, which has none.
# Grouped bar chart: number of accidents in each Country, split by Year.
sns.countplot(data=data, x="Country", hue="Year")
<AxesSubplot:xlabel='Country', ylabel='count'>
# Contingency table of accident counts: rows = Country, columns = Year.
bivariate_analysis_df = pd.crosstab(
    data['Country'],
    data['Year'],
)
print("\n Cross table Analysis of features: ", 'Country', ' and ', 'Year')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Country and Year ------------------------------------------------------
| Year | 2016 | 2017 |
|---|---|---|
| Country | ||
| Country_01 | 174 | 74 |
| Country_02 | 86 | 43 |
| Country_03 | 23 | 18 |
From the above output, the following can be determined-
1.The number of accidents that took place in Country_01 in the year 2016 is 174.
2.The number of accidents that took place in Country_01 in the year 2017 is 74.
3.The number of accidents that took place in Country_02 in the year 2016 is 86.
4.The number of accidents that took place in Country_02 in the year 2017 is 43.
5.The number of accidents that took place in Country_03 in the year 2016 is 23.
6.The number of accidents that took place in Country_03 in the year 2017 is 18.
# Grouped bar chart: number of accidents in each Country, split by Accident Level.
sns.countplot(data=data, x="Country", hue="Accident Level")
<AxesSubplot:xlabel='Country', ylabel='count'>
# Contingency table of accident counts: rows = Country, columns = Accident Level.
bivariate_analysis_df = pd.crosstab(
    data['Country'],
    data['Accident Level'],
)
print("\n Cross table Analysis of features: ", 'Country', ' and ', 'Accident Level')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Country and Accident Level ------------------------------------------------------
| Accident Level | I | II | III | IV | V |
|---|---|---|---|---|---|
| Country | |||||
| Country_01 | 177 | 19 | 21 | 23 | 8 |
| Country_02 | 98 | 19 | 7 | 5 | 0 |
| Country_03 | 34 | 2 | 3 | 2 | 0 |
From the above count plot, it is clearly evident that the maximum number of accidents took place in accident level 1 and country_01.
# Grouped bar chart: number of accidents in each Country, split by Potential Accident Level.
sns.countplot(data=data, x="Country", hue="Potential Accident Level")
<AxesSubplot:xlabel='Country', ylabel='count'>
# Contingency table of accident counts: rows = Country, columns = Potential Accident Level.
bivariate_analysis_df = pd.crosstab(
    data['Country'],
    data['Potential Accident Level'],
)
print("\n Cross table Analysis of features: ", 'Country', ' and ', 'Potential Accident Level')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Country and Potential Accident Level ------------------------------------------------------
| Potential Accident Level | I | II | III | IV | V | VI |
|---|---|---|---|---|---|---|
| Country | ||||||
| Country_01 | 10 | 51 | 64 | 101 | 21 | 1 |
| Country_02 | 6 | 40 | 41 | 33 | 9 | 0 |
| Country_03 | 29 | 4 | 1 | 7 | 0 | 0 |
From the above plot, it is evident that the maximum number of accidents occurred in Country_01 at Potential Accident Level IV (101).
# Grouped bar chart: number of accidents in each Country, split by Local.
sns.countplot(data=data, x="Country", hue="Local")
# Anchor the legend just outside the right edge of the axes so it does not cover the bars.
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x1a507915bb0>
# Contingency table of accident counts: rows = Country, columns = Local.
bivariate_analysis_df = pd.crosstab(
    data['Country'],
    data['Local'],
)
print("\n Cross table Analysis of features: ", 'Country', ' and ', 'Local')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Country and Local ------------------------------------------------------
| Local | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Country | ||||||||||||
| Country_01 | 56 | 0 | 89 | 55 | 0 | 46 | 0 | 0 | 0 | 0 | 2 | 0 |
| Country_02 | 0 | 23 | 0 | 0 | 59 | 0 | 14 | 27 | 2 | 0 | 0 | 4 |
| Country_03 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 41 | 0 | 0 |
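Each Local belongs to exactly one country. Country_01's accidents are concentrated in Local 3 (89); of the Locals that belong to Country_01, Local 11 has the fewest accidents (2).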
# Grouped bar chart: number of accidents in each Country, split by Natureofemployee.
sns.countplot(data=data, x="Country", hue="Natureofemployee")
<AxesSubplot:xlabel='Country', ylabel='count'>
# Contingency table of accident counts: rows = Country, columns = Natureofemployee.
bivariate_analysis_df = pd.crosstab(
    data['Country'],
    data['Natureofemployee'],
)
print("\n Cross table Analysis of features: ", 'Country', ' and ', 'Natureofemployee')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Country and Natureofemployee ------------------------------------------------------
| Natureofemployee | Employee | Third Party | Third Party (Remote) |
|---|---|---|---|
| Country | |||
| Country_01 | 87 | 138 | 23 |
| Country_02 | 84 | 13 | 32 |
| Country_03 | 7 | 34 | 0 |
In Country_01, most accidents involve the Third Party employee type (138); Country_03 has no accidents involving the Third Party (Remote) type.
# Grouped bar chart: number of accidents in each Country, split by Critical Risk.
sns.countplot(data=data, x="Country", hue="Critical Risk")
# Anchor the legend just outside the right edge of the axes so it does not cover the bars.
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x1a507a6ec10>
# Contingency table of accident counts: rows = Critical Risk, columns = Country.
bivariate_analysis_df = pd.crosstab(
    data['Critical Risk'],
    data['Country'],
)
print("\n Cross table Analysis of features: ", 'Country', ' and ', 'Critical Risk')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Country and Critical Risk ------------------------------------------------------
| Country | Country_01 | Country_02 | Country_03 |
|---|---|---|---|
| Critical Risk | |||
| \nNot applicable | 0 | 1 | 0 |
| Bees | 0 | 0 | 10 |
| Blocking and isolation of energies | 1 | 2 | 0 |
| Burn | 0 | 1 | 0 |
| Chemical substances | 4 | 13 | 0 |
| Confined space | 0 | 1 | 0 |
| Cut | 5 | 9 | 0 |
| Electrical Shock | 2 | 0 | 0 |
| Electrical installation | 1 | 0 | 0 |
| Fall | 6 | 1 | 2 |
| Fall prevention | 3 | 2 | 1 |
| Fall prevention (same level) | 5 | 2 | 0 |
| Individual protection equipment | 1 | 0 | 0 |
| Liquid Metal | 0 | 3 | 0 |
| Machine Protection | 1 | 1 | 0 |
| Manual Tools | 7 | 12 | 1 |
| Others | 169 | 45 | 15 |
| Plates | 1 | 0 | 0 |
| Poll | 0 | 0 | 1 |
| Power lock | 3 | 0 | 0 |
| Pressed | 9 | 15 | 0 |
| Pressurized Systems | 1 | 6 | 0 |
| Pressurized Systems / Chemical Substances | 2 | 1 | 0 |
| Projection | 9 | 4 | 0 |
| Projection of fragments | 1 | 0 | 0 |
| Projection/Burning | 0 | 1 | 0 |
| Projection/Choco | 0 | 0 | 1 |
| Projection/Manual Tools | 1 | 0 | 0 |
| Suspended Loads | 3 | 3 | 0 |
| Traffic | 0 | 1 | 0 |
| Vehicles and Mobile Equipment | 7 | 1 | 0 |
| Venomous Animals | 0 | 3 | 10 |
| remains of choco | 6 | 1 | 0 |
"Others" is the most frequent Critical Risk in every country, most of all in Country_01 (169); Country_03 records the fewest distinct Critical Risk types.
# Grouped bar chart: number of accidents at each Local, split by Accident Level.
sns.countplot(data=data, x="Local", hue="Accident Level")
<AxesSubplot:xlabel='Local', ylabel='count'>
# Contingency table of accident counts: rows = Accident Level, columns = Local.
bivariate_analysis_df = pd.crosstab(
    data['Accident Level'],
    data['Local'],
)
print("\n Cross table Analysis of features: ", 'Local', ' and ', 'Accident Level')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Local and Accident Level ------------------------------------------------------
| Local | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Accident Level | ||||||||||||
| I | 45 | 14 | 65 | 30 | 51 | 36 | 9 | 19 | 1 | 34 | 1 | 4 |
| II | 1 | 6 | 8 | 9 | 6 | 1 | 2 | 5 | 0 | 2 | 0 | 0 |
| III | 5 | 2 | 5 | 7 | 2 | 3 | 1 | 1 | 1 | 3 | 1 | 0 |
| IV | 4 | 1 | 8 | 6 | 0 | 5 | 2 | 2 | 0 | 2 | 0 | 0 |
| V | 1 | 0 | 3 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
Accident Level I is the most frequent level overall, peaking in Local 3 with 65 accidents, while Accident Level V is the least frequent level.
# Grouped bar chart: number of accidents at each Local, split by Potential Accident Level.
sns.countplot(data=data, x="Local", hue="Potential Accident Level")
# Anchor the legend just outside the right edge of the axes so it does not cover the bars.
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x1a5077f0100>
# Contingency table of accident counts: rows = Potential Accident Level, columns = Local.
bivariate_analysis_df = pd.crosstab(
    data['Potential Accident Level'],
    data['Local'],
)
print("\n Cross table Analysis of features: ", 'Local', ' and ', 'Potential Accident Level')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Local and Potential Accident Level ------------------------------------------------------
| Local | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Potential Accident Level | ||||||||||||
| I | 2 | 1 | 2 | 2 | 1 | 4 | 1 | 2 | 0 | 29 | 0 | 1 |
| II | 7 | 6 | 12 | 11 | 15 | 20 | 4 | 12 | 1 | 4 | 1 | 2 |
| III | 13 | 6 | 21 | 19 | 25 | 11 | 2 | 8 | 0 | 1 | 0 | 0 |
| IV | 30 | 7 | 41 | 19 | 18 | 10 | 2 | 4 | 1 | 7 | 1 | 1 |
| V | 4 | 3 | 12 | 4 | 0 | 1 | 5 | 1 | 0 | 0 | 0 | 0 |
| VI | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Overall, Local 3 has the most accidents across the Potential Accident Levels (89 in total), while Locals 9 and 11 have the fewest (2 each).
# Grouped bar chart: number of accidents at each Local, split by Natureofemployee.
sns.countplot(data=data, x="Local", hue="Natureofemployee")
# Anchor the legend just outside the right edge of the axes so it does not cover the bars.
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x1a507faeeb0>
# Contingency table of accident counts: rows = Natureofemployee, columns = Local.
bivariate_analysis_df = pd.crosstab(
    data['Natureofemployee'],
    data['Local'],
)
print("\n Cross table Analysis of features: ", 'Local', ' and ', 'Natureofemployee')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Local and Natureofemployee ------------------------------------------------------
| Local | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Natureofemployee | ||||||||||||
| Employee | 23 | 11 | 30 | 14 | 37 | 18 | 11 | 19 | 2 | 7 | 2 | 4 |
| Third Party | 30 | 1 | 48 | 40 | 9 | 20 | 1 | 2 | 0 | 34 | 0 | 0 |
| Third Party (Remote) | 3 | 11 | 11 | 1 | 13 | 8 | 2 | 6 | 0 | 0 | 0 | 0 |
Type Employee is the only type with accidents in every Local; Type Third Party leads in Locals 1, 3, 4, 6 and 10, while Type Third Party (Remote) is the least frequent type overall.
# Grouped bar chart: number of accidents at each Local, split by Critical Risk.
sns.countplot(data=data, x="Local", hue="Critical Risk")
# Anchor the legend just outside the right edge of the axes so it does not cover the bars.
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x1a50802b970>
# Contingency table of accident counts: rows = Critical Risk, columns = Local.
bivariate_analysis_df = pd.crosstab(
    data['Critical Risk'],
    data['Local'],
)
print("\n Cross table Analysis of features: ", 'Local', ' and ', 'Critical Risk')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Local and Critical Risk ------------------------------------------------------
| Local | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Critical Risk | ||||||||||||
| \nNot applicable | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| Bees | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | 0 | 0 |
| Blocking and isolation of energies | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
| Burn | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Chemical substances | 1 | 1 | 0 | 0 | 11 | 3 | 0 | 1 | 0 | 0 | 0 | 0 |
| Confined space | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Cut | 0 | 0 | 2 | 1 | 7 | 2 | 1 | 1 | 0 | 0 | 0 | 0 |
| Electrical Shock | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Electrical installation | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Fall | 2 | 0 | 1 | 2 | 1 | 1 | 0 | 0 | 0 | 2 | 0 | 0 |
| Fall prevention | 0 | 0 | 1 | 0 | 1 | 2 | 1 | 0 | 0 | 1 | 0 | 0 |
| Fall prevention (same level) | 0 | 0 | 0 | 1 | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
| Individual protection equipment | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Liquid Metal | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| Machine Protection | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| Manual Tools | 1 | 2 | 1 | 1 | 5 | 4 | 0 | 4 | 1 | 1 | 0 | 0 |
| Others | 41 | 16 | 68 | 43 | 10 | 15 | 8 | 8 | 0 | 15 | 2 | 3 |
| Plates | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| Poll | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| Power lock | 0 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| Pressed | 2 | 2 | 1 | 2 | 6 | 4 | 0 | 7 | 0 | 0 | 0 | 0 |
| Pressurized Systems | 0 | 1 | 0 | 0 | 3 | 1 | 0 | 2 | 0 | 0 | 0 | 0 |
| Pressurized Systems / Chemical Substances | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| Projection | 3 | 0 | 4 | 0 | 1 | 2 | 2 | 1 | 0 | 0 | 0 | 0 |
| Projection of fragments | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Projection/Burning | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Projection/Choco | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| Projection/Manual Tools | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Suspended Loads | 1 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| Traffic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| Vehicles and Mobile Equipment | 4 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Venomous Animals | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 10 | 0 | 0 |
| remains of choco | 0 | 0 | 4 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
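Critical Risk of type "Others" is the most frequent in most Locals; the exceptions are Local 5 (where "Chemical substances" leads, 11 vs 10) and Local 9 (which has no "Others" accidents).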
# Grouped bar chart: number of accidents at each Local, split by Year.
sns.countplot(data=data, x="Local", hue="Year")
<AxesSubplot:xlabel='Local', ylabel='count'>
# Contingency table of accident counts: rows = Year, columns = Local.
bivariate_analysis_df = pd.crosstab(
    data['Year'],
    data['Local'],
)
print("\n Cross table Analysis of features: ", 'Local', ' and ', 'Year')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Local and Year ------------------------------------------------------
| Local | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Year | ||||||||||||
| 2016 | 38 | 13 | 63 | 38 | 45 | 33 | 7 | 18 | 1 | 23 | 2 | 2 |
| 2017 | 18 | 10 | 26 | 17 | 14 | 13 | 7 | 9 | 1 | 18 | 0 | 2 |
Every Local has at least as many accidents in 2016 as in 2017; Locals 7, 9 and 12 have the same count in both years.
# Grouped bar chart: number of accidents at each Accident Level, split by Potential Accident Level.
sns.countplot(data=data, x="Accident Level", hue="Potential Accident Level")
<AxesSubplot:xlabel='Accident Level', ylabel='count'>
# Contingency table of accident counts: rows = Potential Accident Level, columns = Accident Level.
bivariate_analysis_df = pd.crosstab(
    data['Potential Accident Level'],
    data['Accident Level'],
)
print("\n Cross table Analysis of features: ", 'Accident Level', ' and ', 'Potential Accident Level')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Accident Level and Potential Accident Level ------------------------------------------------------
| Accident Level | I | II | III | IV | V |
|---|---|---|---|---|---|
| Potential Accident Level | |||||
| I | 45 | 0 | 0 | 0 | 0 |
| II | 88 | 7 | 0 | 0 | 0 |
| III | 89 | 14 | 3 | 0 | 0 |
| IV | 78 | 16 | 26 | 21 | 0 |
| V | 9 | 3 | 2 | 9 | 7 |
| VI | 0 | 0 | 0 | 0 | 1 |
Accident Level I is the most common outcome for Potential Accident Levels I to IV, and the actual Accident Level never exceeds the Potential Accident Level.
# Grouped bar chart: number of accidents at each Accident Level, split by Natureofemployee.
sns.countplot(data=data, x="Accident Level", hue="Natureofemployee")
<AxesSubplot:xlabel='Accident Level', ylabel='count'>
# Contingency table of accident counts: rows = Accident Level, columns = Natureofemployee.
bivariate_analysis_df = pd.crosstab(
    data['Accident Level'],
    data['Natureofemployee'],
)
print("\n Cross table Analysis of features: ", 'Accident Level', ' and ', 'Natureofemployee')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Accident Level and Natureofemployee ------------------------------------------------------
| Natureofemployee | Employee | Third Party | Third Party (Remote) |
|---|---|---|---|
| Accident Level | |||
| I | 139 | 130 | 40 |
| II | 15 | 19 | 6 |
| III | 14 | 14 | 3 |
| IV | 10 | 16 | 4 |
| V | 0 | 6 | 2 |
Accident Level I is more dominant across all Employee types, where Level V is least across all types
# Grouped bar chart: number of accidents at each Accident Level, split by Critical Risk.
sns.countplot(data=data, x="Accident Level", hue="Critical Risk")
# Anchor the legend just outside the right edge of the axes so it does not cover the bars.
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x1a5088c3400>
# Contingency table of accident counts: rows = Critical Risk, columns = Accident Level.
bivariate_analysis_df = pd.crosstab(
    data['Critical Risk'],
    data['Accident Level'],
)
print("\n Cross table Analysis of features: ", 'Accident Level', ' and ', 'Critical Risk')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Accident Level and Critical Risk ------------------------------------------------------
| Accident Level | I | II | III | IV | V |
|---|---|---|---|---|---|
| Critical Risk | |||||
| \nNot applicable | 0 | 0 | 0 | 1 | 0 |
| Bees | 10 | 0 | 0 | 0 | 0 |
| Blocking and isolation of energies | 3 | 0 | 0 | 0 | 0 |
| Burn | 0 | 0 | 1 | 0 | 0 |
| Chemical substances | 15 | 2 | 0 | 0 | 0 |
| Confined space | 1 | 0 | 0 | 0 | 0 |
| Cut | 11 | 2 | 1 | 0 | 0 |
| Electrical Shock | 2 | 0 | 0 | 0 | 0 |
| Electrical installation | 0 | 0 | 0 | 1 | 0 |
| Fall | 6 | 0 | 0 | 2 | 1 |
| Fall prevention | 5 | 0 | 0 | 1 | 0 |
| Fall prevention (same level) | 6 | 0 | 0 | 1 | 0 |
| Individual protection equipment | 0 | 1 | 0 | 0 | 0 |
| Liquid Metal | 3 | 0 | 0 | 0 | 0 |
| Machine Protection | 2 | 0 | 0 | 0 | 0 |
| Manual Tools | 12 | 5 | 3 | 0 | 0 |
| Others | 169 | 21 | 23 | 13 | 3 |
| Plates | 1 | 0 | 0 | 0 | 0 |
| Poll | 0 | 0 | 0 | 1 | 0 |
| Power lock | 0 | 0 | 0 | 1 | 2 |
| Pressed | 17 | 1 | 2 | 4 | 0 |
| Pressurized Systems | 6 | 1 | 0 | 0 | 0 |
| Pressurized Systems / Chemical Substances | 2 | 1 | 0 | 0 | 0 |
| Projection | 10 | 2 | 0 | 1 | 0 |
| Projection of fragments | 1 | 0 | 0 | 0 | 0 |
| Projection/Burning | 0 | 1 | 0 | 0 | 0 |
| Projection/Choco | 1 | 0 | 0 | 0 | 0 |
| Projection/Manual Tools | 1 | 0 | 0 | 0 | 0 |
| Suspended Loads | 4 | 0 | 1 | 1 | 0 |
| Traffic | 1 | 0 | 0 | 0 | 0 |
| Vehicles and Mobile Equipment | 5 | 1 | 0 | 1 | 1 |
| Venomous Animals | 13 | 0 | 0 | 0 | 0 |
| remains of choco | 2 | 2 | 0 | 2 | 1 |
Accident Level I is most dominant with the "Others" Critical Risk type (169 accidents).
# Grouped bar chart: number of accidents at each Accident Level, split by Year.
sns.countplot(data=data, x="Accident Level", hue="Year")
<AxesSubplot:xlabel='Accident Level', ylabel='count'>
# Contingency table of accident counts: rows = Year, columns = Accident Level.
bivariate_analysis_df = pd.crosstab(
    data['Year'],
    data['Accident Level'],
)
print("\n Cross table Analysis of features: ", 'Accident Level', ' and ', 'Year')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Accident Level and Year ------------------------------------------------------
| Accident Level | I | II | III | IV | V |
|---|---|---|---|---|---|
| Year | |||||
| 2016 | 211 | 26 | 24 | 19 | 3 |
| 2017 | 98 | 14 | 7 | 11 | 5 |
Accident Level I is the most dominant level in both 2016 and 2017, and Accident Level V is the least frequent in both years.
# Grouped bar chart: number of accidents in each Month, split by Accident Level.
sns.countplot(data=data, x="Month", hue="Accident Level")
# Anchor the legend just outside the right edge of the axes so it does not cover the bars.
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x1a508b1de80>
# Contingency table of accident counts: rows = Accident Level, columns = Month.
bivariate_analysis_df = pd.crosstab(
    data['Accident Level'],
    data['Month'],
)
print("\n Cross table Analysis of features: ", 'Accident Level', ' and ', 'Month')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Accident Level and Month ------------------------------------------------------
| Month | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Accident Level | ||||||||||||
| I | 32 | 42 | 34 | 43 | 31 | 41 | 16 | 15 | 18 | 11 | 9 | 17 |
| II | 2 | 9 | 7 | 2 | 3 | 3 | 1 | 3 | 3 | 4 | 1 | 2 |
| III | 2 | 4 | 3 | 3 | 1 | 2 | 4 | 2 | 2 | 4 | 1 | 3 |
| IV | 2 | 5 | 3 | 3 | 4 | 4 | 2 | 1 | 1 | 2 | 2 | 1 |
| V | 1 | 1 | 3 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
Accident Level 1 dominates across all Months while Level V is minimum
# Grouped bar chart: number of accidents in each Country, split by Accident Level.
sns.countplot(data=data, x="Country", hue="Accident Level")
<AxesSubplot:xlabel='Country', ylabel='count'>
# Contingency table of accident counts: rows = Accident Level, columns = Country.
bivariate_analysis_df = pd.crosstab(
    data['Accident Level'],
    data['Country'],
)
print("\n Cross table Analysis of features: ", 'Accident Level', ' and ', 'Country')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Accident Level and Country ------------------------------------------------------
| Country | Country_01 | Country_02 | Country_03 |
|---|---|---|---|
| Accident Level | |||
| I | 177 | 98 | 34 |
| II | 19 | 19 | 2 |
| III | 21 | 7 | 3 |
| IV | 23 | 5 | 2 |
| V | 8 | 0 | 0 |
Accident Level I is more dominant across all Countries, while Accident Level V is least dominant across all countries
# Grouped bar chart: number of accidents at each Potential Accident Level, split by Natureofemployee.
sns.countplot(data=data, x="Potential Accident Level", hue="Natureofemployee")
<AxesSubplot:xlabel='Potential Accident Level', ylabel='count'>
# Contingency table of accident counts: rows = Potential Accident Level, columns = Natureofemployee.
bivariate_analysis_df = pd.crosstab(
    data['Potential Accident Level'],
    data['Natureofemployee'],
)
print("\n Cross table Analysis of features: ", 'Potential Accident Level', ' and ', 'Natureofemployee')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Potential Accident Level and Natureofemployee ------------------------------------------------------
| Natureofemployee | Employee | Third Party | Third Party (Remote) |
|---|---|---|---|
| Potential Accident Level | |||
| I | 12 | 29 | 4 |
| II | 44 | 37 | 14 |
| III | 53 | 35 | 18 |
| IV | 58 | 68 | 15 |
| V | 11 | 15 | 4 |
| VI | 0 | 1 | 0 |
Potential Accident Level IV dominates for the Third Party type (68), while Level VI is the least frequent across all types, with a single Third Party accident.
# Grouped bar chart: number of accidents at each Potential Accident Level, split by Critical Risk.
sns.countplot(data=data, x="Potential Accident Level", hue="Critical Risk")
# Anchor the legend just outside the right edge of the axes so it does not cover the bars.
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x1a509f5b790>
# Contingency table of accident counts: rows = Critical Risk, columns = Potential Accident Level.
bivariate_analysis_df = pd.crosstab(
    data['Critical Risk'],
    data['Potential Accident Level'],
)
print("\n Cross table Analysis of features: ", 'Potential Accident Level', ' and ', 'Critical Risk')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Potential Accident Level and Critical Risk ------------------------------------------------------
| Potential Accident Level | I | II | III | IV | V | VI |
|---|---|---|---|---|---|---|
| Critical Risk | ||||||
| \nNot applicable | 0 | 0 | 0 | 0 | 1 | 0 |
| Bees | 10 | 0 | 0 | 0 | 0 | 0 |
| Blocking and isolation of energies | 0 | 1 | 2 | 0 | 0 | 0 |
| Burn | 0 | 0 | 0 | 1 | 0 | 0 |
| Chemical substances | 0 | 5 | 8 | 4 | 0 | 0 |
| Confined space | 0 | 0 | 1 | 0 | 0 | 0 |
| Cut | 1 | 6 | 5 | 2 | 0 | 0 |
| Electrical Shock | 0 | 0 | 0 | 2 | 0 | 0 |
| Electrical installation | 0 | 0 | 0 | 0 | 1 | 0 |
| Fall | 1 | 1 | 4 | 2 | 1 | 0 |
| Fall prevention | 1 | 0 | 0 | 5 | 0 | 0 |
| Fall prevention (same level) | 1 | 1 | 3 | 2 | 0 | 0 |
| Individual protection equipment | 0 | 0 | 0 | 1 | 0 | 0 |
| Liquid Metal | 1 | 0 | 0 | 2 | 0 | 0 |
| Machine Protection | 0 | 0 | 2 | 0 | 0 | 0 |
| Manual Tools | 2 | 5 | 9 | 4 | 0 | 0 |
| Others | 16 | 60 | 53 | 85 | 15 | 0 |
| Plates | 0 | 1 | 0 | 0 | 0 | 0 |
| Poll | 0 | 0 | 0 | 1 | 0 | 0 |
| Power lock | 0 | 0 | 0 | 0 | 3 | 0 |
| Pressed | 2 | 5 | 9 | 7 | 1 | 0 |
| Pressurized Systems | 0 | 2 | 3 | 2 | 0 | 0 |
| Pressurized Systems / Chemical Substances | 0 | 1 | 0 | 2 | 0 | 0 |
| Projection | 0 | 2 | 2 | 7 | 2 | 0 |
| Projection of fragments | 0 | 0 | 0 | 1 | 0 | 0 |
| Projection/Burning | 0 | 0 | 0 | 1 | 0 | 0 |
| Projection/Choco | 0 | 1 | 0 | 0 | 0 | 0 |
| Projection/Manual Tools | 0 | 0 | 1 | 0 | 0 | 0 |
| Suspended Loads | 0 | 1 | 0 | 5 | 0 | 0 |
| Traffic | 0 | 1 | 0 | 0 | 0 | 0 |
| Vehicles and Mobile Equipment | 0 | 0 | 2 | 2 | 4 | 0 |
| Venomous Animals | 10 | 2 | 1 | 0 | 0 | 0 |
| remains of choco | 0 | 0 | 1 | 3 | 2 | 1 |
Among all Critical Risk types, "Others" is dominant for Potential Accident Levels I to V; the single level VI record falls under "remains of choco".
# Bar plot of Month (y) against Accident Level (x), with one bar per Year.
plt.figure(figsize=(10, 6))
# x/y are passed by keyword: seaborn 0.12+ rejects positional x/y arguments,
# and the keyword form also works on older releases.
sns.barplot(x='Accident Level', y='Month', hue='Year', data=data, palette='muted')
<AxesSubplot:xlabel='Accident Level', ylabel='Month'>
# Count of records per Potential Accident Level, split by Year.
sns.countplot(
    data=data,
    x="Potential Accident Level",
    hue="Year",
)
<AxesSubplot:xlabel='Potential Accident Level', ylabel='count'>
# Contingency table: rows are Potential Accident Level, columns are Year.
bivariate_analysis_df = pd.crosstab(
    data['Potential Accident Level'],
    data['Year'],
)
print(
    "\n Cross table Analysis of features: ",
    'Potential Accident Level',
    ' and ',
    'Year',
)
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Potential Accident Level and Year ------------------------------------------------------
| Year | 2016 | 2017 |
|---|---|---|
| Potential Accident Level | ||
| I | 26 | 19 |
| II | 69 | 26 |
| III | 75 | 31 |
| IV | 97 | 44 |
| V | 16 | 14 |
| VI | 0 | 1 |
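The number of accidents decreases from 2016 to 2017 for Potential Accident Levels I to V; only level VI goes up (from 0 to 1). Note that the 2017 data has no records after July, so part of the drop is due to the shorter observation period. Potential Accident Level IV is dominant in both 2016 and 2017.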
# Count of records per Potential Accident Level, split by Country.
sns.countplot(
    data=data,
    x="Potential Accident Level",
    hue="Country",
)
<AxesSubplot:xlabel='Potential Accident Level', ylabel='count'>
# Contingency table: rows are Potential Accident Level, columns are Country.
bivariate_analysis_df = pd.crosstab(
    data['Potential Accident Level'],
    data['Country'],
)
print(
    "\n Cross table Analysis of features: ",
    'Potential Accident Level',
    ' and ',
    'Country',
)
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Potential Accident Level and Country ------------------------------------------------------
| Country | Country_01 | Country_02 | Country_03 |
|---|---|---|---|
| Potential Accident Level | |||
| I | 10 | 6 | 29 |
| II | 51 | 40 | 4 |
| III | 64 | 41 | 1 |
| IV | 101 | 33 | 7 |
| V | 21 | 9 | 0 |
| VI | 1 | 0 | 0 |
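Potential Accident Level IV is dominant only in Country_01 (101); in Country_02 levels III (41) and II (40) are the most frequent, and in Country_03 level I dominates (29). Level VI has the fewest accidents, with a single record in Country_01.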
# Count of records per employee type, split by Critical Risk.
sns.countplot(
    data=data,
    x="Natureofemployee",
    hue="Critical Risk",
)
# Anchor the legend just outside the right edge of the axes.
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x1a50a4a3310>
# Contingency table: rows are Critical Risk, columns are Natureofemployee.
bivariate_analysis_df = pd.crosstab(
    data['Critical Risk'],
    data['Natureofemployee'],
)
print(
    "\n Cross table Analysis of features: ",
    'Critical Risk',
    ' and ',
    'Natureofemployee',
)
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Critical Risk and Natureofemployee ------------------------------------------------------
| Natureofemployee | Employee | Third Party | Third Party (Remote) |
|---|---|---|---|
| Critical Risk | |||
| \nNot applicable | 1 | 0 | 0 |
| Bees | 1 | 9 | 0 |
| Blocking and isolation of energies | 2 | 0 | 1 |
| Burn | 1 | 0 | 0 |
| Chemical substances | 9 | 2 | 6 |
| Confined space | 1 | 0 | 0 |
| Cut | 8 | 5 | 1 |
| Electrical Shock | 0 | 0 | 2 |
| Electrical installation | 0 | 1 | 0 |
| Fall | 0 | 5 | 4 |
| Fall prevention | 2 | 3 | 1 |
| Fall prevention (same level) | 3 | 4 | 0 |
| Individual protection equipment | 0 | 1 | 0 |
| Liquid Metal | 3 | 0 | 0 |
| Machine Protection | 1 | 1 | 0 |
| Manual Tools | 7 | 7 | 6 |
| Others | 99 | 109 | 21 |
| Plates | 1 | 0 | 0 |
| Poll | 0 | 1 | 0 |
| Power lock | 0 | 0 | 3 |
| Pressed | 12 | 7 | 5 |
| Pressurized Systems | 4 | 1 | 2 |
| Pressurized Systems / Chemical Substances | 1 | 1 | 1 |
| Projection | 7 | 6 | 0 |
| Projection of fragments | 0 | 1 | 0 |
| Projection/Burning | 1 | 0 | 0 |
| Projection/Choco | 0 | 1 | 0 |
| Projection/Manual Tools | 0 | 1 | 0 |
| Suspended Loads | 4 | 1 | 1 |
| Traffic | 1 | 0 | 0 |
| Vehicles and Mobile Equipment | 3 | 5 | 0 |
| Venomous Animals | 3 | 9 | 1 |
| remains of choco | 3 | 4 | 0 |
Critical Risk of type "Others" is dominant across all Types of Employees
# Count of records per Month, split by Year.
sns.countplot(
    data=data,
    x="Month",
    hue="Year",
)
# Anchor the legend just outside the right edge of the axes.
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x1a50a51aca0>
# Contingency table: rows are Month, columns are Year.
bivariate_analysis_df = pd.crosstab(
    data['Month'],
    data['Year'],
)
print(
    "\n Cross table Analysis of features: ",
    'Month',
    ' and ',
    'Year',
)
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Month and Year ------------------------------------------------------
| Year | 2016 | 2017 |
|---|---|---|
| Month | ||
| 1 | 12 | 27 |
| 2 | 31 | 30 |
| 3 | 34 | 16 |
| 4 | 28 | 23 |
| 5 | 26 | 14 |
| 6 | 31 | 20 |
| 7 | 19 | 5 |
| 8 | 21 | 0 |
| 9 | 24 | 0 |
| 10 | 21 | 0 |
| 11 | 13 | 0 |
| 12 | 23 | 0 |
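From the above plot, it is evident that most accidents happened in 2016 (283 versus 135 in 2017), and that March 2016 was the single worst month (34 accidents). Note that 2017 has no records after July, so the two years are not directly comparable.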
# Count of records per Local, split by Year.
sns.countplot(
    data=data,
    x="Local",
    hue="Year",
)
<AxesSubplot:xlabel='Local', ylabel='count'>
# Contingency table: rows are Year, columns are Local.
bivariate_analysis_df = pd.crosstab(
    data['Year'],
    data['Local'],
)
print(
    "\n Cross table Analysis of features: ",
    'Local',
    ' and ',
    'Year',
)
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Local and Year ------------------------------------------------------
| Local | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Year | ||||||||||||
| 2016 | 38 | 13 | 63 | 38 | 45 | 33 | 7 | 18 | 1 | 23 | 2 | 2 |
| 2017 | 18 | 10 | 26 | 17 | 14 | 13 | 7 | 9 | 1 | 18 | 0 | 2 |
From the above plot, it can be determined that the maximum accidents took place in the local 3 and year 2016.
# Count of records per Weekday, split by Year.
sns.countplot(
    data=data,
    x="Weekday",
    hue="Year",
)
<AxesSubplot:xlabel='Weekday', ylabel='count'>
# Contingency table: rows are Year, columns are Weekday.
bivariate_analysis_df = pd.crosstab(
    data['Year'],
    data['Weekday'],
)
print(
    "\n Cross table Analysis of features: ",
    'Weekday',
    ' and ',
    'Year',
)
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Weekday and Year ------------------------------------------------------
| Weekday | Friday | Monday | Saturday | Sunday | Thursday | Tuesday | Wednesday |
|---|---|---|---|---|---|---|---|
| Year | |||||||
| 2016 | 44 | 40 | 36 | 25 | 58 | 40 | 40 |
| 2017 | 17 | 13 | 20 | 16 | 18 | 29 | 22 |
From the above plot, it is clearly evident that maximum number of accidents took place on thursday and year 2016.
# Count of records per Year, split by Critical Risk.
sns.countplot(
    data=data,
    x="Year",
    hue="Critical Risk",
)
# Anchor the legend just outside the right edge of the axes.
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x1a50aa04b20>
# Contingency table: rows are Critical Risk, columns are Year.
bivariate_analysis_df = pd.crosstab(
    data['Critical Risk'],
    data['Year'],
)
print(
    "\n Cross table Analysis of features: ",
    'Critical Risk',
    ' and ',
    'Year',
)
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Critical Risk and Year ------------------------------------------------------
| Year | 2016 | 2017 |
|---|---|---|
| Critical Risk | ||
| \nNot applicable | 1 | 0 |
| Bees | 10 | 0 |
| Blocking and isolation of energies | 3 | 0 |
| Burn | 0 | 1 |
| Chemical substances | 13 | 4 |
| Confined space | 1 | 0 |
| Cut | 6 | 8 |
| Electrical Shock | 0 | 2 |
| Electrical installation | 1 | 0 |
| Fall | 2 | 7 |
| Fall prevention | 1 | 5 |
| Fall prevention (same level) | 6 | 1 |
| Individual protection equipment | 0 | 1 |
| Liquid Metal | 2 | 1 |
| Machine Protection | 0 | 2 |
| Manual Tools | 14 | 6 |
| Others | 189 | 40 |
| Plates | 1 | 0 |
| Poll | 1 | 0 |
| Power lock | 0 | 3 |
| Pressed | 14 | 10 |
| Pressurized Systems | 7 | 0 |
| Pressurized Systems / Chemical Substances | 3 | 0 |
| Projection | 1 | 12 |
| Projection of fragments | 0 | 1 |
| Projection/Burning | 0 | 1 |
| Projection/Choco | 0 | 1 |
| Projection/Manual Tools | 0 | 1 |
| Suspended Loads | 5 | 1 |
| Traffic | 1 | 0 |
| Vehicles and Mobile Equipment | 0 | 8 |
| Venomous Animals | 1 | 12 |
| remains of choco | 0 | 7 |
From the above plot, it is clearly evident that maximum number of accidents took place with "Others" and year 2016.
# Pairwise relationship grid over the columns of `data`.
sns.pairplot(data=data)
<seaborn.axisgrid.PairGrid at 0x1a50aa458b0>
# Pairwise correlation of the numeric columns only.
# The numeric columns are selected explicitly because pandas 2.x no longer
# drops non-numeric columns silently in DataFrame.corr() and raises on the
# string columns of this frame; select_dtypes works on old and new pandas.
data.select_dtypes(include='number').corr()
| Local | Year | Month | |
|---|---|---|---|
| Local | 1.000000 | 0.054246 | 0.019061 |
| Year | 0.054246 | 1.000000 | -0.416621 |
| Month | 0.019061 | -0.416621 | 1.000000 |
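From the above correlation matrix it is clear that "Year" and "Month" are moderately negatively correlated (-0.42), while "Local" is essentially uncorrelated with both "Year" (0.05) and "Month" (0.02). The Year-Month correlation mostly reflects the fact that 2017 only has records for January to July.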
# Show the column labels of `data`.
data.columns
Index(['Date', 'Country', 'Local', 'Industry Sector', 'Accident Level',
'Potential Accident Level', 'Gender', 'Natureofemployee',
'Critical Risk', 'Description', 'Year', 'Month', 'Weekday', 'Season'],
dtype='object')
# Number of records for every (Year, Accident Level, Potential Accident Level) combination.
(
    data
    .groupby(['Year', 'Accident Level', 'Potential Accident Level'])[['Accident Level']]
    .count()
)
| Accident Level | |||
|---|---|---|---|
| Year | Accident Level | Potential Accident Level | |
| 2016 | I | I | 26 |
| II | 62 | ||
| III | 64 | ||
| IV | 53 | ||
| V | 6 | ||
| II | II | 7 | |
| III | 9 | ||
| IV | 10 | ||
| III | III | 2 | |
| IV | 20 | ||
| V | 2 | ||
| IV | IV | 14 | |
| V | 5 | ||
| V | V | 3 | |
| 2017 | I | I | 19 |
| II | 26 | ||
| III | 25 | ||
| IV | 25 | ||
| V | 3 | ||
| II | III | 5 | |
| IV | 6 | ||
| V | 3 | ||
| III | III | 1 | |
| IV | 6 | ||
| IV | IV | 7 | |
| V | 4 | ||
| V | V | 4 | |
| VI | 1 |
Year 2016 with Accident Level I has maximum accidents of 64 with Potential Accident Level III and 62 with Potential Accident Level II
Year 2017 with Accident Level I has maximum accidents of 26 with Potential Accident Level II and 25 with Potential Accident Level III,IV
# Number of records for every (Year, Industry Sector, Accident Level) combination.
(
    data
    .groupby(['Year', 'Industry Sector', 'Accident Level'])[['Accident Level']]
    .count()
)
| Accident Level | |||
|---|---|---|---|
| Year | Industry Sector | Accident Level | |
| 2016 | Metals | I | 79 |
| II | 9 | ||
| III | 4 | ||
| IV | 5 | ||
| Mining | I | 112 | |
| II | 15 | ||
| III | 17 | ||
| IV | 12 | ||
| V | 3 | ||
| Others | I | 20 | |
| II | 2 | ||
| III | 3 | ||
| IV | 2 | ||
| 2017 | Metals | I | 28 |
| II | 3 | ||
| III | 3 | ||
| IV | 2 | ||
| V | 1 | ||
| Mining | I | 51 | |
| II | 11 | ||
| III | 3 | ||
| IV | 9 | ||
| V | 4 | ||
| Others | I | 19 | |
| III | 1 |
In 2016, Accident Level I is the most frequent level in every Industry Sector: Mining has the most such accidents (112), followed by Metals (79) and Others (20).
In 2017, Accident Level I is again the most frequent level in every sector: Mining (51), Metals (28) and Others (19).
# Number of records for every (Industry Sector, Country, Accident Level) combination.
(
    data
    .groupby(['Industry Sector', 'Country', 'Accident Level'])[['Accident Level']]
    .count()
)
| Accident Level | |||
|---|---|---|---|
| Industry Sector | Country | Accident Level | |
| Metals | Country_01 | I | 36 |
| II | 1 | ||
| III | 3 | ||
| IV | 5 | ||
| V | 1 | ||
| Country_02 | I | 71 | |
| II | 11 | ||
| III | 4 | ||
| IV | 2 | ||
| Mining | Country_01 | I | 140 |
| II | 18 | ||
| III | 17 | ||
| IV | 18 | ||
| V | 7 | ||
| Country_02 | I | 23 | |
| II | 8 | ||
| III | 3 | ||
| IV | 3 | ||
| Others | Country_01 | I | 1 |
| III | 1 | ||
| Country_02 | I | 4 | |
| Country_03 | I | 34 | |
| II | 2 | ||
| III | 3 | ||
| IV | 2 |
#!pip install wordcloud
#!pip install pandas_profiling
from wordcloud import WordCloud
# Draw one word cloud per Accident Level from the accident descriptions.
for i in data['Accident Level'].unique():
    print('WordCloud for Accident Level :', i, '\n')
    # Use the full text of every description in this group. The previous
    # `cat.split()[1]` kept only the second word of each description (and
    # raised IndexError on a one-word description).
    text = " ".join(data.loc[data['Accident Level'] == i, 'Description'].astype(str))
    # Creating word_cloud with text as argument in .generate() method
    word_cloud = WordCloud(collocations=False, background_color='lightyellow').generate(text)
    # Display the generated Word Cloud
    plt.figure(figsize=[10, 10])
    plt.imshow(word_cloud, interpolation='bilinear')
    plt.axis("off")
    # plt.show must be *called*; the bare attribute reference was a no-op,
    # so figures were not flushed next to their printed header.
    plt.show()
    print('-----------------------------')
WordCloud for Accident Level : I ----------------------------- WordCloud for Accident Level : IV ----------------------------- WordCloud for Accident Level : III ----------------------------- WordCloud for Accident Level : II ----------------------------- WordCloud for Accident Level : V -----------------------------
# Draw one word cloud per Potential Accident Level from the accident descriptions.
for i in data['Potential Accident Level'].unique():
    # Level 'VI' is deliberately skipped, as in the original analysis.
    if i == 'VI':
        continue
    print('WordCloud for Potential Accident Level :', str(i), '\n')
    # Use the full text of every description in this group. The previous
    # `cat.split()[1]` kept only the second word of each description (and
    # raised IndexError on a one-word description).
    text = " ".join(data.loc[data['Potential Accident Level'] == i, 'Description'].astype(str))
    # Creating word_cloud with text as argument in .generate() method
    word_cloud = WordCloud(collocations=False, background_color='lightyellow').generate(text)
    # Display the generated Word Cloud
    plt.figure(figsize=[10, 10])
    plt.imshow(word_cloud, interpolation='bilinear')
    plt.axis("off")
    # plt.show must be *called*; the bare attribute reference was a no-op,
    # so figures were not flushed next to their printed header.
    plt.show()
    print('-----------------------------')
WordCloud for Potential Accident Level : IV ----------------------------- WordCloud for Potential Accident Level : III ----------------------------- WordCloud for Potential Accident Level : I ----------------------------- WordCloud for Potential Accident Level : II ----------------------------- WordCloud for Potential Accident Level : V -----------------------------
# Draw one word cloud per Industry Sector from the accident descriptions.
for i in data['Industry Sector'].unique():
    print('WordCloud for Industry type :', str(i), '\n')
    # Use the full text of every description in this group. The previous
    # `cat.split()[1]` kept only the second word of each description (and
    # raised IndexError on a one-word description).
    text = " ".join(data.loc[data['Industry Sector'] == i, 'Description'].astype(str))
    # Creating word_cloud with text as argument in .generate() method
    word_cloud = WordCloud(collocations=False, background_color='lightyellow').generate(text)
    # Display the generated Word Cloud
    plt.figure(figsize=[10, 10])
    plt.imshow(word_cloud, interpolation='bilinear')
    plt.axis("off")
    # plt.show must be *called*; the bare attribute reference was a no-op,
    # so figures were not flushed next to their printed header.
    plt.show()
    print('-----------------------------')
WordCloud for Industry type : Mining ----------------------------- WordCloud for Industry type : Metals ----------------------------- WordCloud for Industry type : Others -----------------------------
# Draw one word cloud per Country from the accident descriptions.
for i in data['Country'].unique():
    print('WordCloud for Country :', i, '\n')
    # Use the full text of every description in this group. The previous
    # `cat.split()[1]` kept only the second word of each description (and
    # raised IndexError on a one-word description).
    text = " ".join(data.loc[data['Country'] == i, 'Description'].astype(str))
    # Creating word_cloud with text as argument in .generate() method
    word_cloud = WordCloud(collocations=False, background_color='lightyellow').generate(text)
    # Display the generated Word Cloud
    plt.figure(figsize=[10, 10])
    plt.imshow(word_cloud, interpolation='bilinear')
    plt.axis("off")
    # plt.show must be *called*; the bare attribute reference was a no-op,
    # so figures were not flushed next to their printed header.
    plt.show()
    print('-----------------------------')
WordCloud for Country : Country_01 ----------------------------- WordCloud for Country : Country_02 ----------------------------- WordCloud for Country : Country_03 -----------------------------
from pandas_profiling import ProfileReport
# Build a profiling report for `data` and embed it in the notebook output.
profile = ProfileReport(
    df=data,
    title="Pandas Profiling Report",
)
profile.to_notebook_iframe()